# -*- coding:utf8 -*-
# !/usr/bin/env python

"""
#全国企业信用信息公示系统（台湾）
"""

# import requesocks as requests
# import ghost
import requests
from bs4 import BeautifulSoup
import re
import sys
import tw_table
import json
import copy
import traceback
from scpy.logger import get_logger

from utils import kill_captcha

import tw_template_dict

# Py2-only setup: force utf-8 as the default str<->unicode codec so the
# mixed byte/unicode handling of the Chinese page content below does not
# raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding("utf-8")

logger = get_logger(__file__)

# Browser-like User-Agent sent with every request to the registry site.
ua = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36"
headers = {
    # "Accept": "image/webp,image/*,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Host": "gcis.nat.gov.tw",
    "Referer": "http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do",
    "User-Agent": ua,
}

# Single shared session: cookies set when fetching the search page / captcha
# image persist across the later form posts (required for the captcha check).
req = requests.session()
req.headers = headers


def download_captcha_kill(companyName):
    """Fetch the raw company + branch detail pages for one Taiwan company.

    Flow: load the search page (to obtain the one-time ``queryKey`` token),
    download and crack the captcha, post the search form, then fetch the
    company detail page and the branch listing page.

    Args:
        companyName: company registration number (banNo), also used as the
            search keyword.

    Returns:
        dict: raw-page payload (province/type/html/keyword/...) on success.
        '': the captcha was rejected or the cracker returned garbage --
            the caller is expected to retry.
        None: the site reports no record for this company.

    Raises:
        Exception: captcha-cracking service failure, or the site redirected
            to its new portal ("web error!").
    """
    index_url = "http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do"
    captcha_url = "http://gcis.nat.gov.tw/pub/kaptcha.jpg"

    index_res = req.get(index_url).content
    # The search form carries a one-time token that must be echoed back.
    query_key = re.findall('''name="queryKey" value='(.*?)'/>''', index_res)

    captcha = req.get(captcha_url).content
    try:
        res_code = kill_captcha(captcha, 'tw', 'jpg')
    except Exception as e:
        logger.error("破解验证码的服务出现异常")
        logger.error(e)
        # Bare raise preserves the original traceback (raise e would not).
        raise
    if not res_code or res_code == 'wrong' or len(res_code) > 100:
        logger.info('验证码为:%s' % res_code)
        logger.error("破解验证码的服务出现异常,可能是下载的验证码错误，也可能破解服务出现异常！")
        return ''  # 返回空字符串，用于重复破解

    check_data = {
        "method": "query",
        "otherEnterFlag": "false",
        "useEUC": "N",
        "isShowEUC": "N",
        "queryKey": query_key[0],
        "selCmpyType": "1",
        "selQueryType": "2",
        "queryStr": companyName,
        "brBanNo": "",
        "imageCode": res_code,
    }

    check_res = req.post(index_url, data=check_data).content
    # Parse/normalize the response ONCE; the original re-ran BeautifulSoup
    # for every marker test and also had a dead re.findall() call here.
    check_text = str(BeautifulSoup(check_res, "html5lib"))
    if "驗證碼錯誤，請重新輸入！" in check_text:
        logger.info("驗證碼錯誤")
        return ""

    if "秒後，為您自動轉跳新站。" in check_text:
        logger.info("秒後，為您自動轉跳新站。")
        raise Exception("web error!")

    if "查無資料！" in check_text:
        logger.info("查無資料！")
        return None

    com_url = "http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoAction.do?method=detail&banNo=%s" % (companyName)
    com_res = req.get(com_url).content
    com_soup = BeautifulSoup(com_res, "html5lib")

    branch_url = "http://gcis.nat.gov.tw/pub/cmpy/branInfoListAction.do?method=query&banNo=%s&from=" % (companyName)
    branch_res = req.get(branch_url).content
    branch_soup = BeautifulSoup(branch_res, "html5lib")

    raw_dict = {
        "province": "tw",
        "type": "1",
        "html": {"base": str(com_soup), "branch": str(branch_soup)},
        "yearList": [],
        "keyword": companyName,
        "companyName": companyName,
        "json": "",
    }

    return raw_dict


def clean_tr(table_html, word):
    """Return the single ``<tr>...</tr>`` fragment of *table_html* that
    contains *word*.

    Raises:
        Exception: unless exactly one matching row is found.
    """
    pattern = "<tr.*?%s.*?/tr>" % word
    matches = re.findall(pattern, table_html, re.S)
    if len(matches) != 1:
        raise Exception("clean tr error")
    return matches[0]


def parse_base(raw_html):
    """Extract the basic-registration section from the company detail page.

    Table index 8 is fed to tw_table.basic(); table index 9's text becomes
    the ``operateScope`` field (whitespace-normalized).
    """
    soup = BeautifulSoup(raw_html, "html5lib")
    tables = soup.find_all("table")

    basic_html = str(tables[8]) if len(tables) > 8 else ""
    result = tw_table.basic(basic_html)

    scope_text = str(tables[9].text).replace(" ", "") if len(tables) > 9 else ""
    # Strip utf-8 nbsp+newline artifacts first, then collapse newline runs.
    scope_text = re.sub("\xc2\xa0\n", "", scope_text)
    scope_text = re.sub("\n+", "\n", scope_text)
    result[0]["operateScope"] = scope_text
    return result


def parse_person(raw_html):
    """Extract the person (officer) list from the company detail page.

    Table index 13 of the page is handed to tw_table.personList().
    """
    soup = BeautifulSoup(raw_html, "html5lib")
    tables = soup.find_all("table")
    person_html = str(tables[13]) if len(tables) > 13 else ""
    return tw_table.personList(person_html)


def parse_branch(raw_html):
    """Extract the branch-office list from the branch listing page.

    Table index 3 of the page is handed to tw_table.filiationList().
    """
    soup = BeautifulSoup(raw_html, "html5lib")
    tables = soup.find_all("table")
    branch_html = str(tables[3]) if len(tables) > 3 else ""
    return tw_table.filiationList(branch_html)


def parse(raw_dict):
    if not raw_dict:
        raise Exception("input error")
    raw_html = raw_dict.get("html", "")
    if not raw_html:
        raise Exception("raw_base error")

    res_base_dict = copy.deepcopy(tw_template_dict.void_base_dict)

    raw_base = raw_html.get("base", "")
    raw_branch = raw_html.get("branch", "")

    res_base_dict["basicList"] = parse_base(raw_base)
    res_base_dict["personList"] = parse_person(raw_base)
    res_base_dict["filiationList"] = parse_branch(raw_branch)
    res_base_dict["province"] = "tw"
    print json.dumps(res_base_dict, indent=4, ensure_ascii=False)

    return res_base_dict


def search2(company_name, MAXTIME=40):
    """Crawl one company with captcha retries, then parse the result.

    raw_dict acts as a three-state sentinel: "" = not fetched yet or the
    captcha was rejected (retry); None = the registry reported no record;
    dict = pages fetched successfully.

    Args:
        company_name: company registration number (banNo) to query.
        MAXTIME: maximum number of captcha-cracking attempts.

    Returns:
        None when the company does not exist;
        (raw_dict, res_base_dict) on success;
        (raw_dict, None) when fetching succeeded but parsing failed.

    Raises:
        Exception: download failure, or captcha rejected MAXTIME times.
    """
    raw_dict = ""  # initial sentinel value (see docstring)
    a_time = MAXTIME
    while a_time > 0:
        if raw_dict is None:  # registry reported "no record" for this company
            return None
        elif raw_dict == "":
            if a_time < MAXTIME:
                logger.error("重复破解验证码!当前设定重复破解次数为:%s, 剩余次数为:%s " % (MAXTIME, a_time))
            a_time -= 1
            try:
                raw_dict = download_captcha_kill(company_name)
            except Exception, e:
                traceback.print_exc(e)
                raise e
        else:
            break  # pages fetched successfully
    # Loop exhausted with the captcha still rejected -> give up.
    if a_time <= 1 and raw_dict == "":
        raise Exception("多次破解验证码错误,当前设置次数为：%s" % MAXTIME)
    else:
        try:
            res_base_dict = parse(raw_dict)
            basic_list = res_base_dict.get("basicList", [])
            if basic_list:
                # Replace the banNo keyword with the parsed company name.
                res_company_name = basic_list[0].get("enterpriseName", "")
                raw_dict["companyName"] = res_company_name
            return raw_dict, res_base_dict
        except Exception, e:
            # Parsing failed: return the raw pages so they are not lost.
            traceback.print_exc(e)
            return raw_dict, None


def test():
    """Build a raw_dict fixture from locally saved HTML pages.

    Reads ./raw_html.html and ./raw_branch.html and wraps them in the same
    payload shape that download_captcha_kill() returns, so parse() can be
    exercised offline.
    """
    with open("./raw_html.html", "r") as fp:
        base_html = fp.read()
    with open("./raw_branch.html", "r") as fp:
        branch_html = fp.read()

    return {
        "province": "tw",
        "type": "1",
        "html": {"base": base_html, "branch": branch_html},
        "yearList": [],
        "keyword": "04541302",
        "companyName": "04541302",
        "json": "",
    }


def gov_info(ban_no, company_name):
    """Fetch cross-agency extras for one company: trademark/IP search via the
    SearchAgent portal (into tmsearch.tipo.gov.tw), then download the report
    PDFs listed by reportReg.jsp into ./pdf/.

    Args:
        ban_no: company registration number.
        company_name: full registered company name.

    NOTE(review): appears experimental/unfinished -- contains a bare print
    and is only referenced from commented-out code in __main__.
    """
    index_url = "http://gcis.nat.gov.tw/SearchAgent/SA_Index.jsp"
    # NOTE(review): this REPLACES the module-level session headers for all
    # subsequent requests on the shared `req` session, not just this call.
    req.headers = {
        "Host": "gcis.nat.gov.tw",
        "Origin": "http://gcis.nat.gov.tw",
        "Referer": "http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoAction.do?method=detail&banNo=%s" % ban_no,
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": ua,
    }

    index_data = {
        "c_name": company_name,
        "c_id": ban_no,
        "RetURL": "http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoAction.do?method=detail&banNo=%s" % ban_no
    }

    # Prime the SearchAgent session with the company identity.
    index_res = req.post(index_url, data=index_data).content

    patten_data = {
        "c_name": company_name,
        # fdr_7:7
        "fdr_8": "8",
        "c_id": ban_no,
        "RetURL": "http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoAction.do?method=detail&banNo=%s" % ban_no,
        "from": "O",
        # "fdr_array":7,
        "fdr_array": "8",
    }
    patten_url = "http://gcis.nat.gov.tw/SearchAgent/SA_Query.jsp?from=O"
    patten_res = req.post(patten_url, data=patten_data).content

    # First result link ("b0") leads into the TIPO trademark/IP search.
    b0_html_url = re.findall('''<div id="b0".*?href="(.*?)">''', patten_res)
    b0_html_url = "http://gcis.nat.gov.tw/SearchAgent/" + b0_html_url[0] if b0_html_url else ""
    if b0_html_url:
        b0_res = req.get(b0_html_url).content
        patten_url_2 = "http://tmsearch.tipo.gov.tw/TIPO_DR/servlet/InitApplicantIPOList"
        # Re-post every hidden form field from the intermediate page.
        patten_data_2 = re.findall('''<input type="hidden" name="(.*?)" value="(.*?)">''', b0_res)
        patten_res_2 = req.post(patten_url_2, data=dict(patten_data_2))

        print patten_res_2

        # import pdb
        # pdb.set_trace()

    #
    # def patten():
    #     # patents
    #

    com_list_url = "http://gcis.nat.gov.tw/pub/cmpy/reportReg.jsp"

    # NOTE(review): csrfPreventionSalt is a hard-coded, URL-encoded value and
    # YYYMM/type are placeholders ("xxx") -- confirm these still work.
    com_list_data = {
        'csrfPreventionSalt': '%5BLjava.lang.String%3B%40822391',
        'org': 'AL',
        'YYYMM': 'xxx',
        'type': 'xxx',
        'Submit': '%ACd%B8%DF',
    }
    pdf_html = req.post(com_list_url, data=com_list_data).content

    pdf_list = re.findall('''href=\'(.*?pdf)''', pdf_html)

    # Turn relative "./..." links into absolute URLs and save each PDF
    # under ./pdf/ (directory must already exist).
    pdf_list = map(lambda x: x.replace("./", "http://gcis.nat.gov.tw/pub/cmpy/"), pdf_list)
    for url in pdf_list:
        file_name = re.findall("fileName=(.*?pdf)", url)[0]
        pdf_file = req.get(url).content
        with open("./pdf/%s" % file_name, "wb") as fp:
            fp.write(pdf_file)




            # req.proxies = {
            #     'http': 'socks5://127.0.0.1:1080',
            #     # 'https': 'socks5://127.0.0.1:1080'
            # }

            # gh = ghost.Ghost()
            # req = ghost.Session(gh, user_agent=ua)

            # req.set_proxy(
            #     type_="http",
            #     host='127.0.0.1',
            #     port=1080,
            #     # user='aes-256-cfb',
            #     # password='1qaz2wsx',
            # )



            # ip_url = "http://1212.ip138.com/ic.asp"
            #
            # # page, extra_resources = req.open("http://gcis.nat.gov.tw/pub/cmpy/cmpyInfoListAction.do")
            # # req.open(ip_url)
            #
            # # import pdb
            # # pdb.set_trace()
            #
            # # index_res = req.get(index_url)
            # img_res = req.get(captcha_url).content
            # return img_res



            # print img_res


if __name__ == "__main__":
    # Crawl the sample company and dump the (raw, parsed) result pair.
    res = search2(company_name="04541302", MAXTIME=40)
    print json.dumps(res, ensure_ascii=False, indent=4)
    # download_captcha_kill("04541302")

    # test_dict = test()
    # parse(test_dict)
    # ban_no = "04541302"
    # company_name = "鴻海精密工業股份有限公司"
    # gov_info(ban_no, company_name)




    # NOTE(review): stray unconditional call with an empty banNo -- runs on
    # every execution after the crawl above and performs extra network I/O.
    # Looks like leftover captcha debugging; confirm intent and remove.
    download_captcha_kill("")

    # for i in range(0, 101):
    #     img = download_captcha_kill("")
    #     with open("%s.jpg" % (i), "w") as fp:
    #         fp.write(img)
